{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import spacy\n",
    "nlp = spacy.load('en_coref_sm')             # 引用小的英语指代模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "[My sister: [My sister, She], a dog: [a dog, him]]\n",
      "My sister has a dog. My sister loves a dog.\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(u'My sister has a dog. She loves him.')\n",
    "\n",
    "print(doc._.has_coref)                      # 是否有指代？\n",
    "print(doc._.coref_clusters)                 # 共指链组成的列表\n",
    "print(doc._.coref_resolved)                 # 将代词用指代簇中的主要描述代替后的文本"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "下面的例子中我们可以获得更多详细信息。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My sister has a dog. She loves him\n",
      "appearance of cluster 0: My sister\n",
      "\tMy sister at [0,9]\n",
      "\tShe at [21,24]\n",
      "appearance of cluster 1: a dog\n",
      "\ta dog at [14,19]\n",
      "\thim at [31,34]\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(u'My sister has a dog. She loves him')\n",
    "print(doc.text)                       # 重新获得文本内容,string对象\n",
    "for clust in doc._.coref_clusters:\n",
    "    clust_id = clust.i                # 该指代在本文档中所有指代中的序号 \n",
    "    main_mention = clust.main         # 主描述，共指链中的代表词语\n",
    "    print(\"appearance of cluster %d: %s\" % (clust_id,main_mention))\n",
    "    for mention in clust.mentions:\n",
    "        print(\"\\t%s at [%d,%d]\" % (mention.text,mention.start_char,mention.end_char))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;这个例子揭示了共指消解中的3层的结构，Doc[文档]-cluster[共指链，代表同一实体的所有对象]-mention[共指链中的描述，实体的一个描述]。其中我们得到了共指链的代表词和每个个别描述在原文中的位置，这些将有助于我们还原文本或干一些别的事情（尽管已经有了coref_resolved，有时候我们还是需要做出一些特别的修改，后面的一篇文章中我就会用到它们。）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;以上例子已经揭示了这个工具的主要功能。这里再提供一些细节，Document下面有3种结构，下面列出它们支持的方法。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "[a dog, him]\n",
      "a dog\n",
      "a dog: [a dog, him]\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(u'My sister has a dog. She loves him')\n",
    "\n",
    "clust1 = doc._.coref_clusters[1]                        # Cluster，一个共指链\n",
    "print(clust1.i)                                         # 该指代在本文档中所有指代中的序号 \n",
    "print(clust1.mentions)                                  # 本共指链中的所有对象\n",
    "print(clust1.main)                                      # 主描述，共指链中的代表词语\n",
    "print(clust1.mentions[-1]._.coref_cluster)              # _.coref_cluster【该对象所处的共指链】"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "[a dog: [a dog, him]]\n",
      "My sister has a dog . She loves him \n"
     ]
    }
   ],
   "source": [
    "token = doc[-1]                                         # token，相当于一个词语\n",
    "print(token._.in_coref)                                 # 是否在至少一个共指链中\n",
    "print(token._.coref_clusters)                           # 包括这个token的所有共指链\n",
    "# 遍历所有token\n",
    "for tok in doc:\n",
    "    print(tok,end = \" \")\n",
    "print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "a dog\n",
      "a dog: [a dog, him]\n"
     ]
    }
   ],
   "source": [
    "span = doc[-1:]                                         # span，doc中的一个部分\n",
    "print(span._.is_coref)                                  # 是否至少包含了一个共指对象\n",
    "print(span._.coref_cluster.main)\n",
    "print(span._.coref_cluster.main._.coref_cluster)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py36]",
   "language": "python",
   "name": "conda-env-py36-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
